## Loading required package: carData
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## The following object is masked from 'package:car':
##
## logit
## Loading required package: sysfonts
## Loading required package: showtextdb
library(readr) # 加载readr包
library(readxl) # 加载readxl包
# datafile <- read_excel(file.choose()) # 读取Excel文件
datafile <- read_excel("BMI.xlsx") # 读取Excel文件
# datafile <- read_csv("test.csv") # 读取Excel文件
datafile <- datafile %>%
rename(height = "身高",
weight = "体重",
gender = "性别",
attitude = "态度"
) # 对变量进行重命名## # A tibble: 6 × 6
## id gender height weight attitude BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 180 85 3 26.2
## 2 2 0 165 62 2 22.8
## 3 3 0 168 65 1 23.0
## 4 4 1 175 65 1 21.2
## 5 5 1 165 63 1 23.1
## 6 6 1 168 60 3 21.3
## tibble [14 × 6] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:14] 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : num [1:14] 1 0 0 1 1 1 1 1 1 0 ...
## $ height : num [1:14] 180 165 168 175 165 168 175 170 165 163 ...
## $ weight : num [1:14] 85 62 65 65 63 60 70 64 58 55 ...
## $ attitude: num [1:14] 3 2 1 1 1 3 3 3 2 2 ...
## $ BMI : num [1:14] 26.2 22.8 23 21.2 23.1 ...
## [1] 14 6
## [1] 14
## [1] 6
## $class
## [1] "tbl_df" "tbl" "data.frame"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
##
## $names
## [1] "id" "gender" "height" "weight" "attitude" "BMI"
## [1] "tbl_df" "tbl" "data.frame"
## [1] "id" "gender" "height" "weight" "attitude" "BMI"
## id gender height weight attitude
## Min. : 1.00 Min. :0.0 Min. :158.0 Min. :45.00 Min. :1.000
## 1st Qu.: 4.25 1st Qu.:0.0 1st Qu.:163.5 1st Qu.:52.75 1st Qu.:1.000
## Median : 7.50 Median :0.5 Median :165.5 Median :61.00 Median :2.000
## Mean : 7.50 Mean :0.5 Mean :167.0 Mean :60.36 Mean :1.857
## 3rd Qu.:10.75 3rd Qu.:1.0 3rd Qu.:169.5 3rd Qu.:64.75 3rd Qu.:2.750
## Max. :14.00 Max. :1.0 Max. :180.0 Max. :85.00 Max. :3.000
## BMI
## Min. :18.03
## 1st Qu.:20.41
## Median :21.28
## Mean :21.50
## 3rd Qu.:22.84
## Max. :26.23
## # A tibble: 14 × 6
## id gender height weight attitude BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 180 85 3 26.2
## 2 2 0 165 62 2 22.8
## 3 3 0 168 65 1 23.0
## 4 4 1 175 65 1 21.2
## 5 5 1 165 63 1 23.1
## 6 6 1 168 60 3 21.3
## 7 7 1 175 70 3 22.9
## 8 8 1 170 64 3 22.1
## 9 9 1 165 58 2 21.3
## 10 10 0 163 55 2 20.7
## 11 11 0 158 45 1 18.0
## 12 12 0 160 50 1 19.5
## 13 13 0 166 51 1 18.5
## 14 14 0 160 52 2 20.3
# 方法一
datafile_long <-
datafile %>%
pivot_longer(
cols = c(time1, time2),
names_to = "time",
values_to = "value"
) # 宽型转长型
# 方法二
# 示例数据:某实验前后测量的体重数据(单位:kg)
before <- c(70, 68, 75, 80, 72, 78, 74)
after <- c(68, 67, 73, 78, 71, 76, 73)
data <- data.frame(
subject = rep(1:length(before),2),
Group = rep(c("Before", "After"), each = length(before)),
Weight = c(before, after)
)
data## subject Group Weight
## 1 1 Before 70
## 2 2 Before 68
## 3 3 Before 75
## 4 4 Before 80
## 5 5 Before 72
## 6 6 Before 78
## 7 7 Before 74
## 8 1 After 68
## 9 2 After 67
## 10 3 After 73
## 11 4 After 78
## 12 5 After 71
## 13 6 After 76
## 14 7 After 73
# 方法三
set.seed(123)
data <- data.frame(
Subject = factor(1:10),
Time1 = rnorm(10, mean = 70, sd = 5),
Time2 = rnorm(10, mean = 75, sd = 5),
Time3 = rnorm(10, mean = 80, sd = 5)
)
# 将数据转换为长格式
long_data <- gather(data, key = "Time", value = "Score", Time1, Time2, Time3)
long_data## Subject Time Score
## 1 1 Time1 67.19762
## 2 2 Time1 68.84911
## 3 3 Time1 77.79354
## 4 4 Time1 70.35254
## 5 5 Time1 70.64644
## 6 6 Time1 78.57532
## 7 7 Time1 72.30458
## 8 8 Time1 63.67469
## 9 9 Time1 66.56574
## 10 10 Time1 67.77169
## 11 1 Time2 81.12041
## 12 2 Time2 76.79907
## 13 3 Time2 77.00386
## 14 4 Time2 75.55341
## 15 5 Time2 72.22079
## 16 6 Time2 83.93457
## 17 7 Time2 77.48925
## 18 8 Time2 65.16691
## 19 9 Time2 78.50678
## 20 10 Time2 72.63604
## 21 1 Time3 74.66088
## 22 2 Time3 78.91013
## 23 3 Time3 74.86998
## 24 4 Time3 76.35554
## 25 5 Time3 76.87480
## 26 6 Time3 71.56653
## 27 7 Time3 84.18894
## 28 8 Time3 80.76687
## 29 9 Time3 74.30932
## 30 10 Time3 86.26907
# 独立样本student's t test
t.result <-
t.test(
height ~ gender,
data = datafile,
var.equal = TRUE
)
t.result##
## Two Sample t-test
##
## data: height by gender
## t = -3.2339, df = 12, p-value = 0.007167
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -13.868165 -2.703263
## sample estimates:
## mean in group 0 mean in group 1
## 162.8571 171.1429
## List of 10
## $ statistic : Named num -3.23
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 12
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.00717
## $ conf.int : num [1:2] -13.9 -2.7
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 163 171
## ..- attr(*, "names")= chr [1:2] "mean in group 0" "mean in group 1"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means between group 0 and group 1"
## $ stderr : num 2.56
## $ alternative: chr "two.sided"
## $ method : chr " Two Sample t-test"
## $ data.name : chr "height by gender"
## - attr(*, "class")= chr "htest"
## [1] " Two Sample t-test"
## [1] "two.sided"
## difference in means between group 0 and group 1
## 0
## mean in group 0 mean in group 1
## 162.8571 171.1429
## t
## -3.233888
## df
## 12
## [1] 0.00716743
## [1] 2.562153
## [1] -13.86817
## [1] -2.703263
## [1] -13.868165 -2.703263
## attr(,"conf.level")
## [1] 0.95
##
## Welch Two Sample t-test
##
## data: height by gender
## t = -3.2339, df = 10.248, p-value = 0.00869
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -13.975874 -2.595555
## sample estimates:
## mean in group 0 mean in group 1
## 162.8571 171.1429
# Welch's t-test的具体计算
result <- datafile %>%
group_by(gender) %>%
summarise(
n = n(),
mean = mean(height, na.rm = TRUE),
sd = sd(height, na.rm = TRUE)
) # 按性别对身高进行描述性统计
mean(datafile$height[datafile$gender == 0])## [1] 162.8571
## [1] 171.1429
t.value <- (result[1,3] - result[2,3])/sqrt(result[1,4]^2/result[1,2] + result[2,4]^2/result[2,2]) # 计算t值
# 计算Welch-Satterthwaite自由度
df.value <- (result[1,4]^2/result[1,2] + result[2,4]^2/result[2,2])^2/((result[1,4]^2/result[1,2])^2/(result[1,2]-1) + (result[2,4]^2/result[2,2])^2/(result[2,2]-1)) # 计算自由度
# 计算p值
t.value <- as.numeric(t.value)
df.value <- as.numeric(df.value)
p.value <- 2*pt(-abs(t.value), df.value)
t.value## [1] -3.233888
## [1] 10.24801
## [1] 0.008689748
独立样本 t 检验 用于比较两个独立组的均值是否存在显著差异。这种检验适用于一个定量变量和一个二分类分组变量。
根据两组的方差是否相等,分为两种情况: 1. 方差相等:使用标准 t 检验。 2. 方差不相等:使用 Welch t 检验(更稳健)。
当方差不相等(Welch t 检验): \[ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} \]
自由度: 对于 Welch t 检验,自由度根据以下公式计算: \[ df = \frac{\left(\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}\right)^2}{\frac{\left(\frac{s_1^2}{n_1}\right)^2}{n_1 - 1} + \frac{\left(\frac{s_2^2}{n_2}\right)^2}{n_2 - 1}} \]
# 示例数据
data <- data.frame(
score = c(85, 90, 78, 92, 88, 76, 95, 89, 80, 84),
group = c("A", "A", "B", "B", "A", "B", "A", "A", "B", "B")
)
# 执行独立样本 t 检验
t_test_result <- t.test(score ~ group, data = data)
# 查看结果
print(t_test_result)##
## Welch Two Sample t-test
##
## data: score by group
## t = 2.2665, df = 6.3952, p-value = 0.06128
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
## -0.4709231 15.2709231
## sample estimates:
## mean in group A mean in group B
## 89.4 82.0
var.test()
的结果判断方差是否齐性,影响 t 检验的计算方式。var.test() 检验两组方差是否相等:var.equal = TRUE。alternative = "greater"。alternative = "less"。shapiro.test()
检验正态性:wilcox.test())。独立样本 t 检验是比较两组均值差异的重要工具。在 R 中,使用
t.test() 可快速实现: - 若方差齐性假设成立,可使用标准 t
检验。 - 若方差不齐性成立,默认使用 Welch t 检验。
在分析中,应根据数据的正态性、样本量和方差情况选择合适的检验方法。如不满足正态性假设,可用非参数检验替代。
##
## F test to compare two variances
##
## data: height by gender
## F = 0.41496, num df = 6, denom df = 6, p-value = 0.3086
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.07130127 2.41494298
## sample estimates:
## ratio of variances
## 0.414956
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.3698 0.2646
## 12
##
## Bartlett test of homogeneity of variances
##
## data: height by factor(gender)
## Bartlett's K-squared = 1.0384, df = 1, p-value = 0.3082
Bartlett’s Test 的原假设和备择假设如下:
检验统计量 \(T\) 的公式为:
\[ T = \frac{(N - k) \ln(S^2) - \sum_{i=1}^{k} (n_i - 1) \ln(S_i^2)}{1 + \frac{1}{3(k-1)} \left(\sum_{i=1}^{k} \frac{1}{n_i - 1} - \frac{1}{N - k}\right)} \]
其中:
\[ S^2 = \frac{\sum_{i=1}^{k} (n_i - 1) S_i^2}{N - k} \]
Bartlett’s Test 的检验统计量 \(T\) 近似服从 \(\chi^2\) 分布,自由度为 \(k - 1\)。
如果数据非正态,建议使用更稳健的方法,例如 Levene’s Test。
bartlett.test()R 提供了内置的 bartlett.test() 函数,用于执行 Bartlett’s
Test。
##
## Bartlett test of homogeneity of variances
##
## data: height by group
## Bartlett's K-squared = 0.50223, df = 3, p-value = 0.9184
运行 bartlett.test() 后,输出结果类似如下:
这意味着数据满足方差齐性的假设,可以继续执行单因素方差分析(ANOVA)。
方差齐性检验: 在执行单因素方差分析(ANOVA)之前,验证分组数据是否满足方差齐性假设。
组间方差比较: 检查多个实验组的方差是否显著不同。
| 特点 | Bartlett’s Test | Levene’s Test |
|---|---|---|
| 正态性假设 | 需要满足正态性 | 不需要严格满足正态性 |
| 稳健性 | 对非正态数据敏感 | 对非正态数据更稳健 |
| 用途 | 用于正态数据的方差齐性检验 | 用于正态和非正态数据的方差齐性检验 |
| 极端值的影响 | 容易受到极端值影响 | 对极端值更稳健 |
以下是完整的代码示例:
# 示例数据
data <- data.frame(
height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)
# 执行 Bartlett's Test
bartlett_result <- bartlett.test(height ~ group, data = data)
# 打印结果
print(bartlett_result)
# 检查数据正态性(以每组为单位)
by(data$height, data$group, shapiro.test)Levene’s Test 的假设如下:
Levene’s Test 的检验统计量 \(W\) 定义如下:
\[ W = \frac{(N - k)}{(k - 1)} \cdot \frac{\sum_{i=1}^k n_i (Z_{i.} - Z_{..})^2}{\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{i.})^2} \]
其中:
center 选用均值:\(\text{center}_i = \bar{Y}_i\);center 选用中位数:\(\text{center}_i =
\text{median}(Y_i)\)。leveneTest()leveneTest() 是 R 中 car
包提供的函数,用于执行 Levene’s Test。
# 示例数据
data <- data.frame(
height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)
# 如果尚未安装 car 包
# install.packages("car")
# 加载包
# library(car)## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 0.1667 0.9159
## 8
F value:检验统计量,表示组间偏差的差异程度。
Df:自由度,包括组间自由度(\(k - 1\))和组内自由度(\(N - k\))。
Pr(>F):p
值,用于判断是否拒绝原假设。leveneTest()
默认使用中位数作为中心计算偏差(center = "median"),适合非正态分布数据。
如果数据接近正态分布,可以选择均值作为中心:
单因素方差分析(ANOVA): 在 ANOVA 中,需要验证数据是否满足方差齐性假设。
处理方差不齐的情况: 如果方差不齐,可以使用更稳健的方法(如 Welch 方差分析):
# 示例数据
data <- data.frame(
height = c(160, 165, 170, 155, 150, 145, 175, 180, 165, 160, 150, 155),
group = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D")
)
# 安装并加载 car 包
install.packages("car")
library(car)
# 执行 Levene’s Test
levene_result <- leveneTest(height ~ group, data = data)
# 打印结果
print(levene_result)
# 以均值为中心计算
leveneTest(height ~ group, data = data, center = "mean")## Call:
## aov(formula = height ~ factor(attitude), data = datafile)
##
## Terms:
## factor(attitude) Residuals
## Sum of Squares 229.1667 286.8333
## Deg. of Freedom 2 11
##
## Residual standard error: 5.106443
## Estimated effects may be unbalanced
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(attitude) 2 229.2 114.58 4.394 0.0396 *
## Residuals 11 286.8 26.08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## List of 13
## $ coefficients : Named num [1:3] 165.33 -2.08 7.92
## ..- attr(*, "names")= chr [1:3] "(Intercept)" "factor(attitude)2" "factor(attitude)3"
## $ residuals : Named num [1:14] 6.75 1.75 2.667 9.667 -0.333 ...
## ..- attr(*, "names")= chr [1:14] "1" "2" "3" "4" ...
## $ effects : Named num [1:14] -624.86 8.87 12.26 7.94 -2.06 ...
## ..- attr(*, "names")= chr [1:14] "(Intercept)" "factor(attitude)2" "factor(attitude)3" "" ...
## $ rank : int 3
## $ fitted.values: Named num [1:14] 173 163 165 165 165 ...
## ..- attr(*, "names")= chr [1:14] "1" "2" "3" "4" ...
## $ assign : int [1:3] 0 1 1
## $ qr :List of 5
## ..$ qr : num [1:14, 1:3] -3.742 0.267 0.267 0.267 0.267 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:14] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:3] "(Intercept)" "factor(attitude)2" "factor(attitude)3"
## .. ..- attr(*, "assign")= int [1:3] 0 1 1
## .. ..- attr(*, "contrasts")=List of 1
## .. .. ..$ factor(attitude): chr "contr.treatment"
## ..$ qraux: num [1:3] 1.27 1.46 1.35
## ..$ pivot: int [1:3] 1 2 3
## ..$ tol : num 1e-07
## ..$ rank : int 3
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 11
## $ contrasts :List of 1
## ..$ factor(attitude): chr "contr.treatment"
## $ xlevels :List of 1
## ..$ factor(attitude): chr [1:3] "1" "2" "3"
## $ call : language aov(formula = height ~ factor(attitude), data = datafile)
## $ terms :Classes 'terms', 'formula' language height ~ factor(attitude)
## .. ..- attr(*, "variables")= language list(height, factor(attitude))
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "height" "factor(attitude)"
## .. .. .. ..$ : chr "factor(attitude)"
## .. ..- attr(*, "term.labels")= chr "factor(attitude)"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(height, factor(attitude))
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:2] "height" "factor(attitude)"
## $ model :'data.frame': 14 obs. of 2 variables:
## ..$ height : num [1:14] 180 165 168 175 165 168 175 170 165 163 ...
## ..$ factor(attitude): Factor w/ 3 levels "1","2","3": 3 2 1 1 1 3 3 3 2 2 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language height ~ factor(attitude)
## .. .. ..- attr(*, "variables")= language list(height, factor(attitude))
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "height" "factor(attitude)"
## .. .. .. .. ..$ : chr "factor(attitude)"
## .. .. ..- attr(*, "term.labels")= chr "factor(attitude)"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(height, factor(attitude))
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "factor"
## .. .. .. ..- attr(*, "names")= chr [1:2] "height" "factor(attitude)"
## - attr(*, "class")= chr [1:2] "aov" "lm"
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = height ~ factor(attitude), data = datafile)
##
## $`factor(attitude)`
## diff lwr upr p adj
## 2-1 -2.083333 -10.9858829 6.819216 0.8059400
## 3-1 7.916667 -0.9858829 16.819216 0.0827135
## 3-2 10.000000 0.2477456 19.752254 0.0444816
boxplot(height ~ attitude, data = datafile,
main = "Height by Attitude",
xlab = "Attitude",
ylab = "Height",
col = c("skyblue", "pink", "lightgreen"))## # A tibble: 5 × 6
## id gender height weight attitude BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 180 85 3 26.2
## 2 2 0 165 62 2 22.8
## 3 3 0 168 65 1 23.0
## 4 4 1 175 65 1 21.2
## 5 5 1 165 63 1 23.1
#数据分布
hist(datafile$weight,
main = "Weight Distribution",
xlab = "Weight",
ylab = "Frequency",
col = "skyblue")hist(datafile$height,
main = "Weight Distribution",
xlab = "Weight",
ylab = "Frequency",
col = "skyblue")# 描述统计量
N <- sapply(datafile, length)
Mean <- sapply(datafile, mean)
SD <- sapply(datafile, sd)
Min <- sapply(datafile, min)
Med <- sapply(datafile, median)
Max <- sapply(datafile, max)
result <- cbind(N, Mean, SD, Min, Med, Max)
result## N Mean SD Min Med Max
## id 14 7.500000 4.1833001 1.00000 7.50000 14.00000
## gender 14 0.500000 0.5188745 0.00000 0.50000 1.00000
## height 14 167.000000 6.3001831 158.00000 165.50000 180.00000
## weight 14 60.357143 10.0046692 45.00000 61.00000 85.00000
## attitude 14 1.857143 0.8644378 1.00000 2.00000 3.00000
## BMI 14 21.503286 2.1203676 18.02596 21.28123 26.23457
## # A tibble: 6 × 7
## id gender height weight attitude BMI cat
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1 1 180 85 3 26.2 1
## 2 2 0 165 62 2 22.8 1
## 3 3 0 168 65 1 23.0 1
## 4 4 1 175 65 1 21.2 1
## 5 5 1 165 63 1 23.1 1
## 6 6 1 168 60 3 21.3 0
levels(datafile$cat) <- c("light", "heavy")
boxplot(height ~ cat, data = datafile,
main = "Height by Weight Category",
xlab = "Weight Category",
ylab = "Height",
col = c("skyblue", "pink"))##
## Call:
## lm(formula = weight ~ height, data = datafile)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9114 -1.8930 0.0944 2.8386 5.8483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -181.0808 31.6915 -5.714 9.70e-05 ***
## height 1.4457 0.1896 7.623 6.14e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.308 on 12 degrees of freedom
## Multiple R-squared: 0.8289, Adjusted R-squared: 0.8146
## F-statistic: 58.12 on 1 and 12 DF, p-value: 6.138e-06
# 模拟1000抽样80%的数据
nsimu <- 1000
ss <- nrow(datafile)
ss0 <- round(ss*0.8)
R2 <- rep(0,nsimu)
for (i in 1:nsimu) {
index <- sample(1:ss, ss0)
model <- lm(weight ~ height, data = datafile[index,])
y.hat <- predict(model, datafile[-index,3])
y.true <- datafile$weight[-index]
R2[i] <- 1 - sum((y.true - y.hat)^2)/sum((y.true - mean(y.true))^2)
R2[i] <- R2[i] * 100
}
head(R2)## [1] 67.01724 77.41251 95.68766 58.71462 77.13235 -30.27099
## height gender
## 1.871503 1.871503
# 模型的AIC,BIC
# 用于评估模型的优劣,特别是在平衡模型的拟合程度和复杂度时
# AIC()与step()中的AIC()结果不一样,相差一个常数nln(2pi)+n+2
model.aic <- step(model)## Start: AIC=44.72
## weight ~ height + gender
##
## Df Sum of Sq RSS AIC
## - gender 1 0.18 222.69 42.734
## <none> 222.52 44.723
## - height 1 562.62 785.14 60.375
##
## Step: AIC=42.73
## weight ~ height
##
## Df Sum of Sq RSS AIC
## <none> 222.69 42.734
## - height 1 1078.5 1301.21 65.448
## Start: AIC=46.64
## weight ~ height + gender
##
## Df Sum of Sq RSS AIC
## - gender 1 0.18 222.69 44.013
## <none> 222.52 46.641
## - height 1 562.62 785.14 61.653
##
## Step: AIC=44.01
## weight ~ height
##
## Df Sum of Sq RSS AIC
## <none> 222.69 44.013
## - height 1 1078.5 1301.21 66.087
##
## Call:
## lm(formula = weight ~ height, data = datafile)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9114 -1.8930 0.0944 2.8386 5.8483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -181.0808 31.6915 -5.714 9.70e-05 ***
## height 1.4457 0.1896 7.623 6.14e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.308 on 12 degrees of freedom
## Multiple R-squared: 0.8289, Adjusted R-squared: 0.8146
## F-statistic: 58.12 on 1 and 12 DF, p-value: 6.138e-06
## [1] 44.72337
## [1] 86.45365
## [1] 41.73028
## id
## 41.73028
# 人群细分
datafile$height_new <- as.factor((datafile$height > median(datafile$height))*1)
datafile$gender_new <- as.factor((datafile$gender > median(datafile$gender))*1)
table(datafile$height_new, datafile$gender_new)/nrow(datafile)##
## 0 1
## 0 0.3571429 0.1428571
## 1 0.1428571 0.3571429
##
## 0 1
## 7 7
## [1] TRUE
## district bedrooms halls area floor subway school price
## 1 丰台 2 2 109.38765 low 0 0 4.920575
## 2 石景山 2 1 55.01044 high 1 0 3.369027
## 3 石景山 2 1 79.69252 high 1 0 3.769788
## 4 西城 2 1 52.47666 high 1 1 8.319766
## 5 东城 1 1 50.81697 low 1 1 8.077537
## 6 西城 2 0 67.15277 middle 1 1 8.131468
## district bedrooms halls area floor subway school price
## 1 丰台 2 2 109.38765 low 0 0 4.920575
## 2 石景山 2 1 55.01044 high 1 0 3.369027
## 3 石景山 2 1 79.69252 high 1 0 3.769788
## 4 西城 2 1 52.47666 high 1 1 8.319766
## 5 东城 1 1 50.81697 low 1 1 8.077537
# 因变量原始及对数变换后的直方图
par(mfrow = c(1, 2))
hist(house$price,
main = "Price Distribution",
xlab = "Price",
ylab = "Frequency",
col = "skyblue")
hist(log(house$price),
main = "Log Price Distribution",
xlab = "Log Price",
ylab = "Frequency",
col = "skyblue")# 连续性自变量的直方图
par(mfrow = c(1, 1))
hist(house$area,
main = "Area Distribution",
xlab = "Area",
ylab = "Frequency",
col = "skyblue")# 按行政区划分对价格的对数进行描述性统计并排序
library(tidyverse)
library(stringi)
house %>%
group_by(district) %>%
summarise(
n = n(),
mean = mean(log(price), na.rm = TRUE),
sd = sd(log(price), na.rm = TRUE),
min = min(log(price), na.rm = TRUE),
med = median(log(price), na.rm = TRUE),
max = max(log(price), na.rm = TRUE)
) %>%
mutate(pinyin = stri_trans_general(district, "Latin")) %>% # 转换为拼音
arrange(pinyin) %>% # 按拼音排序,若降序用desc()
select(-pinyin) # 去除临时拼音列## # A tibble: 6 × 7
## district n mean sd min med max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 朝阳 2624 1.73 0.231 0.996 1.73 2.59
## 2 东城 2600 2.01 0.261 0.897 2.07 2.73
## 3 丰台 2721 1.54 0.195 0.694 1.54 2.25
## 4 海淀 2687 1.97 0.239 1.09 1.98 2.64
## 5 石景山 1806 1.48 0.224 0.744 1.45 2.34
## 6 西城 2562 2.18 0.231 0.879 2.20 2.76
# 建立全模型a
# 将基准类别设置为 "西城"
house$district <- relevel(factor(house$district), ref = "朝阳")
# 另一种方法
# house$district <- factor(house$district, levels = c("朝阳", "西城", "东城", "丰台", "海淀", "石景山"))
model.a <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = house)
summary(model.a)##
## Call:
## lm(formula = log(price) ~ district * subway + school + floor +
## as.factor(bedrooms) + as.factor(halls) + area, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2292 -0.1327 -0.0015 0.1372 0.8874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.563e+00 1.581e-02 98.858 < 2e-16 ***
## district东城 1.290e-01 2.222e-02 5.802 6.68e-09 ***
## district丰台 -1.059e-01 1.516e-02 -6.987 2.92e-12 ***
## district海淀 2.082e-01 1.558e-02 13.366 < 2e-16 ***
## district石景山 -1.110e-01 1.517e-02 -7.314 2.73e-13 ***
## district西城 4.453e-01 1.810e-02 24.606 < 2e-16 ***
## subway 1.506e-01 1.341e-02 11.228 < 2e-16 ***
## school 1.564e-01 4.316e-03 36.235 < 2e-16 ***
## floorlow 3.151e-02 4.300e-03 7.328 2.45e-13 ***
## floormiddle 2.468e-02 4.197e-03 5.880 4.18e-09 ***
## as.factor(bedrooms)2 9.598e-03 4.940e-03 1.943 0.052028 .
## as.factor(bedrooms)3 2.712e-02 6.511e-03 4.165 3.13e-05 ***
## as.factor(bedrooms)4 6.054e-02 1.248e-02 4.853 1.23e-06 ***
## as.factor(bedrooms)5 1.127e-01 2.434e-02 4.632 3.65e-06 ***
## as.factor(halls)1 2.930e-02 8.396e-03 3.490 0.000484 ***
## as.factor(halls)2 1.257e-01 9.842e-03 12.771 < 2e-16 ***
## as.factor(halls)3 1.546e-01 2.722e-02 5.681 1.37e-08 ***
## area -1.014e-03 7.084e-05 -14.315 < 2e-16 ***
## district东城:subway 1.242e-01 2.306e-02 5.384 7.38e-08 ***
## district丰台:subway -4.341e-02 1.640e-02 -2.647 0.008138 **
## district海淀:subway 8.668e-03 1.683e-02 0.515 0.606458
## district石景山:subway -1.013e-01 1.691e-02 -5.988 2.18e-09 ***
## district西城:subway -4.869e-02 1.901e-02 -2.561 0.010440 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.212 on 14977 degrees of freedom
## Multiple R-squared: 0.6097, Adjusted R-squared: 0.6091
## F-statistic: 1063 on 22 and 14977 DF, p-value: < 2.2e-16
# 建立对照模型b
model.b <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + area, data = house)
summary(model.b)##
## Call:
## lm(formula = log(price) ~ district * subway + school + floor +
## as.factor(bedrooms) + area, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2359 -0.1361 -0.0025 0.1397 0.9058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.571e+00 1.444e-02 108.797 < 2e-16 ***
## district东城 1.089e-01 2.246e-02 4.851 1.24e-06 ***
## district丰台 -1.188e-01 1.532e-02 -7.758 9.17e-15 ***
## district海淀 1.975e-01 1.575e-02 12.540 < 2e-16 ***
## district石景山 -1.248e-01 1.533e-02 -8.139 4.28e-16 ***
## district西城 4.236e-01 1.827e-02 23.189 < 2e-16 ***
## subway 1.404e-01 1.356e-02 10.355 < 2e-16 ***
## school 1.573e-01 4.361e-03 36.068 < 2e-16 ***
## floorlow 3.140e-02 4.349e-03 7.220 5.43e-13 ***
## floormiddle 2.478e-02 4.244e-03 5.839 5.36e-09 ***
## as.factor(bedrooms)2 1.225e-02 4.876e-03 2.512 0.012005 *
## as.factor(bedrooms)3 2.905e-02 6.551e-03 4.435 9.28e-06 ***
## as.factor(bedrooms)4 4.921e-02 1.258e-02 3.913 9.17e-05 ***
## as.factor(bedrooms)5 8.488e-02 2.427e-02 3.497 0.000473 ***
## area -3.326e-04 6.154e-05 -5.406 6.56e-08 ***
## district东城:subway 1.284e-01 2.333e-02 5.504 3.77e-08 ***
## district丰台:subway -3.577e-02 1.659e-02 -2.157 0.031059 *
## district海淀:subway 1.013e-02 1.702e-02 0.595 0.551875
## district石景山:subway -9.458e-02 1.710e-02 -5.530 3.26e-08 ***
## district西城:subway -3.991e-02 1.922e-02 -2.077 0.037864 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2144 on 14980 degrees of freedom
## Multiple R-squared: 0.6005, Adjusted R-squared: 0.6
## F-statistic: 1185 on 19 and 14980 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Model 1: log(price) ~ district * subway + school + floor + as.factor(bedrooms) +
## area
## Model 2: log(price) ~ district * subway + school + floor + as.factor(bedrooms) +
## as.factor(halls) + area
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 14980 688.72
## 2 14977 672.85 3 15.866 117.72 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 考察行政区划与有无地铁的交互作用
model.c <- lm(log(price) ~ district + subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = house)
anova(model.c, model.a)## Analysis of Variance Table
##
## Model 1: log(price) ~ district + subway + school + floor + as.factor(bedrooms) +
## as.factor(halls) + area
## Model 2: log(price) ~ district * subway + school + floor + as.factor(bedrooms) +
## as.factor(halls) + area
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 14982 679.04
## 2 14977 672.85 5 6.19 27.557 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## AIC与BIC模型选择
housenew <- house
ss <- nrow(housenew)
housenew$bedrooms <- floor(6*runif(ss))
head(housenew)## district bedrooms halls area floor subway school price
## 1 丰台 0 2 109.38765 low 0 0 4.920575
## 2 石景山 0 1 55.01044 high 1 0 3.369027
## 3 石景山 1 1 79.69252 high 1 0 3.769788
## 4 西城 5 1 52.47666 high 1 1 8.319766
## 5 东城 1 1 50.81697 low 1 1 8.077537
## 6 西城 3 0 67.15277 middle 1 1 8.131468
model.s <- lm(log(price) ~ district*subway + school + floor + as.factor(bedrooms) + as.factor(halls) + area, data = housenew)
summary(model.s)##
## Call:
## lm(formula = log(price) ~ district * subway + school + floor +
## as.factor(bedrooms) + as.factor(halls) + area, data = housenew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22834 -0.13350 -0.00118 0.13832 0.87451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.556e+00 1.617e-02 96.222 < 2e-16 ***
## district东城 1.287e-01 2.225e-02 5.785 7.40e-09 ***
## district丰台 -1.061e-01 1.518e-02 -6.994 2.78e-12 ***
## district海淀 2.091e-01 1.559e-02 13.410 < 2e-16 ***
## district石景山 -1.101e-01 1.519e-02 -7.244 4.58e-13 ***
## district西城 4.460e-01 1.811e-02 24.620 < 2e-16 ***
## subway 1.495e-01 1.343e-02 11.133 < 2e-16 ***
## school 1.577e-01 4.313e-03 36.568 < 2e-16 ***
## floorlow 2.978e-02 4.296e-03 6.933 4.30e-12 ***
## floormiddle 2.283e-02 4.190e-03 5.449 5.15e-08 ***
## as.factor(bedrooms)1 -6.244e-03 5.962e-03 -1.047 0.294948
## as.factor(bedrooms)2 -3.090e-03 5.980e-03 -0.517 0.605359
## as.factor(bedrooms)3 -3.404e-03 6.039e-03 -0.564 0.572948
## as.factor(bedrooms)4 3.996e-03 5.956e-03 0.671 0.502298
## as.factor(bedrooms)5 -1.897e-03 6.028e-03 -0.315 0.753036
## as.factor(halls)1 2.889e-02 8.175e-03 3.534 0.000411 ***
## as.factor(halls)2 1.232e-01 9.630e-03 12.790 < 2e-16 ***
## as.factor(halls)3 1.683e-01 2.697e-02 6.239 4.51e-10 ***
## area -7.394e-04 5.313e-05 -13.916 < 2e-16 ***
## district东城:subway 1.244e-01 2.309e-02 5.389 7.19e-08 ***
## district丰台:subway -4.169e-02 1.642e-02 -2.539 0.011138 *
## district海淀:subway 1.078e-02 1.684e-02 0.640 0.522182
## district石景山:subway -9.935e-02 1.693e-02 -5.868 4.52e-09 ***
## district西城:subway -4.609e-02 1.903e-02 -2.422 0.015434 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2122 on 14976 degrees of freedom
## Multiple R-squared: 0.6088, Adjusted R-squared: 0.6082
## F-statistic: 1013 on 23 and 14976 DF, p-value: < 2.2e-16
model.aic <- step(model.s, trace = F)
model.bic <- step(model.s, k = log(ss), trace = F)
summary(model.aic)##
## Call:
## lm(formula = log(price) ~ district + subway + school + floor +
## as.factor(halls) + area + district:subway, data = housenew)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22977 -0.13372 -0.00062 0.13812 0.87294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.554e+00 1.569e-02 99.074 < 2e-16 ***
## district东城 1.286e-01 2.225e-02 5.780 7.60e-09 ***
## district丰台 -1.064e-01 1.517e-02 -7.013 2.44e-12 ***
## district海淀 2.090e-01 1.559e-02 13.404 < 2e-16 ***
## district石景山 -1.103e-01 1.519e-02 -7.263 3.96e-13 ***
## district西城 4.459e-01 1.811e-02 24.624 < 2e-16 ***
## subway 1.493e-01 1.343e-02 11.118 < 2e-16 ***
## school 1.578e-01 4.313e-03 36.584 < 2e-16 ***
## floorlow 2.979e-02 4.295e-03 6.937 4.18e-12 ***
## floormiddle 2.287e-02 4.189e-03 5.460 4.84e-08 ***
## as.factor(halls)1 2.875e-02 8.171e-03 3.518 0.000436 ***
## as.factor(halls)2 1.230e-01 9.628e-03 12.778 < 2e-16 ***
## as.factor(halls)3 1.683e-01 2.697e-02 6.241 4.46e-10 ***
## area -7.393e-04 5.312e-05 -13.918 < 2e-16 ***
## district东城:subway 1.246e-01 2.308e-02 5.397 6.87e-08 ***
## district丰台:subway -4.132e-02 1.642e-02 -2.517 0.011847 *
## district海淀:subway 1.084e-02 1.684e-02 0.644 0.519578
## district石景山:subway -9.905e-02 1.693e-02 -5.852 4.97e-09 ***
## district西城:subway -4.605e-02 1.902e-02 -2.421 0.015484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2122 on 14981 degrees of freedom
## Multiple R-squared: 0.6087, Adjusted R-squared: 0.6082
## F-statistic: 1295 on 18 and 14981 DF, p-value: < 2.2e-16
## [1] TRUE
## ARA ASSET ATO ROA GROWTH LEV SHARE ST
## 1 0.19230963 19.85605 0.0052 0.087709802 -0.9507273 0.4458801 26.89 0
## 2 0.22011996 20.91086 0.0056 0.016820383 -0.9426563 0.3986864 39.62 0
## 3 0.32529169 19.35262 0.0166 0.042468332 -0.9374404 0.3033481 26.46 0
## 4 0.02572868 21.43893 0.0028 0.018151630 -0.8529953 0.7582502 60.16 0
## 5 0.53359089 21.61334 0.2552 0.004146607 -0.8167039 0.7268753 54.24 1
## 6 0.06127521 21.04117 0.1248 0.051080619 -0.8102884 0.4017503 57.14 0
## [1] 684 8
## [1] 36
## [1] 0.05263158
par(mfrow = c(2, 3))
hist(logitdata$ASSET,
main = "对数资产归模",
xlab = "ASSET",
ylab = "Frequency",
col = "skyblue")
hist(logitdata$ATO,
main = "资产周转率",
xlab = "ATO",
ylab = "Frequency",
col = "skyblue")
hist(logitdata$ROA,
main = "资产回报率",
xlab = "ROA",
ylab = "Frequency",
col = "skyblue")
hist(logitdata$GROWTH,
main = "销售收入增长率",
xlab = "GROWTH",
ylab = "Frequency",
col = "skyblue")
hist(logitdata$LEV,
main = "杠杆率",
xlab = "LEV",
ylab = "Frequency",
col = "skyblue")
hist(logitdata$SHARE,
main = "第一大股东持股比率",
xlab = "SHARE",
ylab = "Frequency",
col = "skyblue")# 因变量:是否ST
par(mfrow = c(1, 1))
boxplot(ARA ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "应收账款占比",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))par(mfrow = c(2, 3))
boxplot(ASSET ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "对数资产规模",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))
boxplot(ATO ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "资产周转率",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))
boxplot(ROA ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "资产回报率",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))
boxplot(GROWTH ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "销售收入增长率",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))
boxplot(LEV ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "杠杆水平",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))
boxplot(SHARE ~ ST, data = logitdata,
xlab = "是否ST",
ylab = "第一大股东持股比率",
names = c("非ST", "ST"),
col = c("skyblue", "pink"))# 因变量、自变量的描述性统计
N <- sapply(logitdata, length)
Mean <- sapply(logitdata, mean)
SD <- sapply(logitdata, sd)
Min <- sapply(logitdata, min)
Med <- sapply(logitdata, median)
Max <- sapply(logitdata, max)
result <- cbind(N, Mean, SD, Min, Med, Max)
result## N Mean SD Min Med Max
## ARA 684 0.09504945 0.09228931 0.00000000 0.06832718 0.6346842
## ASSET 684 20.77785347 0.83352322 18.66070036 20.70050279 24.0176107
## ATO 684 0.51977383 0.36282648 0.00280000 0.43340000 3.1513000
## ROA 684 0.05587011 0.03859391 0.00008170 0.05125798 0.3111300
## GROWTH 684 0.11525745 0.30702005 -0.95072732 0.10228264 0.9985565
## LEV 684 0.40606356 0.16576397 0.01843107 0.40673974 0.9803218
## SHARE 684 46.03451754 17.68437717 4.16000000 44.95500000 88.5800000
## ST 684 0.05263158 0.22346029 0.00000000 0.00000000 1.0000000
# 建立logistic回归模型
model.full <- glm(ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE, data = logitdata, family = binomial(link = "logit"))
summary(model.full)##
## Call:
## glm(formula = ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE,
## family = binomial(link = "logit"), data = logitdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.86924 4.63586 -1.913 0.05573 .
## ARA 4.87974 1.49245 3.270 0.00108 **
## ASSET 0.24660 0.22409 1.100 0.27115
## ATO -0.50738 0.65744 -0.772 0.44026
## ROA -0.63661 6.22354 -0.102 0.91853
## GROWTH -0.83335 0.56706 -1.470 0.14167
## LEV 2.35415 1.20138 1.960 0.05005 .
## SHARE -0.01111 0.01115 -0.997 0.31891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 282.07 on 683 degrees of freedom
## Residual deviance: 251.51 on 676 degrees of freedom
## AIC: 267.51
##
## Number of Fisher Scoring iterations: 6
## [1] 7.493111e-05
## [1] 267.5057 303.7293
##
## Call:
## glm(formula = ST ~ ARA + GROWTH + LEV, family = binomial(link = "logit"),
## data = logitdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.6022 0.5646 -8.152 3.58e-16 ***
## ARA 5.1301 1.4341 3.577 0.000347 ***
## GROWTH -0.9061 0.5471 -1.656 0.097708 .
## LEV 2.5501 1.0778 2.366 0.017986 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 282.07 on 683 degrees of freedom
## Residual deviance: 254.04 on 680 degrees of freedom
## AIC: 262.04
##
## Number of Fisher Scoring iterations: 6
##
## Call:
## glm(formula = ST ~ ARA, family = binomial(link = "logit"), data = logitdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6834 0.2722 -13.531 < 2e-16 ***
## ARA 6.3316 1.3132 4.821 1.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 282.07 on 683 degrees of freedom
## Residual deviance: 261.71 on 682 degrees of freedom
## AIC: 265.71
##
## Number of Fisher Scoring iterations: 6
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.77
## Area under the curve: 0.7738
## Area under the curve: 0.671
par(mfrow = c(1, 1))
plot(roc.full, col = "skyblue", lwd = 2, main = "ROC Curve")
plot(roc.aic, col = "pink", lwd = 2, add = TRUE)
plot(roc.bic, col = "lightgreen", lwd = 2, add = TRUE)
legend("bottomright", legend = c("Full Model", "AIC Model", "BIC Model"), col = c("skyblue", "pink", "lightgreen"), lwd = 2)# 基于外样本的模型精度评估
# 重复100次,每次随机抽取80%的数据作为训练集,20%的数据作为测试集
nsimu <- 100
ss <- nrow(logitdata)
ss0 <- round(ss*0.8)
auc.full <- rep(0, nsimu)
AUC <- as.data.frame(matrix(0, nrow = nsimu, ncol = 3))
names(AUC) <- c("Full", "AIC", "BIC")
for (i in 1:nsimu) {
index <- sample(1:ss, ss0)
model.full <- glm(ST ~ ARA + ASSET + ATO + ROA + GROWTH + LEV + SHARE, data = logitdata[index,], family = binomial(link = "logit"))
model.aic <- glm(ST ~ ARA + GROWTH + LEV, data = logitdata[index,], family = binomial(link = "logit"))
model.bic <- glm(ST ~ ARA, data = logitdata[index,], family = binomial(link = "logit"))
roc.full <- roc(logitdata$ST[-index], predict(model.full, logitdata[-index,], type = "response"), quiet = TRUE)
roc.aic <- roc(logitdata$ST[-index], predict(model.aic, logitdata[-index,], type = "response"), quiet = TRUE)
roc.bic <- roc(logitdata$ST[-index], predict(model.bic, logitdata[-index,], type = "response"), quiet = TRUE)
AUC[i,] <- c(roc.full$auc, roc.aic$auc, roc.bic$auc)
}
boxplot(AUC, main = "外样本AUC对比", col = c("skyblue", "pink", "lightgreen"))## gender usage credit loan history accounts status
## 1 女性 0.9900680 3000 0 0 0 0
## 2 女性 1.3578084 3000 0 7 3 1
## 3 男性 1.2766882 5000 0 7 2 3
## 4 男性 0.7242874 3000 1745 0 0 0
## 5 男性 1.0739191 2000 0 0 11 0
## 6 女性 1.1730013 3000 0 0 2 3
## gender usage credit loan history accounts status Y1 Y2
## 1 女性 0.9900680 3000 0 0 0 0 0 0
## 2 女性 1.3578084 3000 0 7 3 1 1 1
## 3 男性 1.2766882 5000 0 7 2 3 1 3
## 4 男性 0.7242874 3000 1745 0 0 0 0 0
## 5 男性 1.0739191 2000 0 0 11 0 0 0
## 6 女性 1.1730013 3000 0 0 2 3 1 3
## [1] 8000 9
## [1] 0.61475
## gender usage credit loan history accounts status Y1 Y2
## 2 女性 1.3578084 3000 0 7 3 1 1 1
## 3 男性 1.2766882 5000 0 7 2 3 1 3
## 6 女性 1.1730013 3000 0 0 2 3 1 3
## 7 男性 1.0302797 5000 0 0 5 6 1 6
## 8 女性 0.9457060 4000 0 2 2 1 1 1
## 10 男性 0.6010362 3000 0 0 1 2 1 2
# 分析Y1=1时的不同类型
library(tidyverse)
table(orderdata1$Y2) %>%
barplot(main = "Y2分布",
xlab = "逾期严重程度",
ylab = "频数",
las = 1,
col = "skyblue")# 合并部分类
orderdata$Y2 <- 4*(orderdata$Y2>4) + orderdata$Y2 *(orderdata$Y2 <= 4)
orderdata1 <- orderdata[orderdata$Y1 == 1,]
head(orderdata1)## gender usage credit loan history accounts status Y1 Y2
## 2 女性 1.3578084 3000 0 7 3 1 1 1
## 3 男性 1.2766882 5000 0 7 2 3 1 3
## 6 女性 1.1730013 3000 0 0 2 3 1 3
## 7 男性 1.0302797 5000 0 0 5 6 1 4
## 8 女性 0.9457060 4000 0 2 2 1 1 1
## 10 男性 0.6010362 3000 0 0 1 2 1 2
##
## 1 2 3 4
## 1189 2018 1110 601
# 对新的Y2重绘制图
table(orderdata1$Y2) %>%
barplot(main = "Y2分布",
xlab = "逾期严重程度",
ylab = "频数",
las = 1,
col = "skyblue")##
## 女性 男性
## 2544 5456
##
## 女性 男性
## 0.318 0.682
##
## 女性 男性
## 0.318 0.682
# usage:信用卡相对使用率的分布
hist(orderdata$usage,
main = "信用卡相对使用率",
xlab = "usage",
ylab = "频数",
las = 1,
col = "skyblue")## gender usage credit loan history accounts status Y1 Y2
## 903 男性 6.074610 50000 17268 8 4 3 1 3
## 1333 男性 4.241007 3000 25571 2 4 4 1 4
## 2347 男性 4.683206 10000 6811 9 1 1 1 1
## 2357 女性 6.024418 12000 13799 2 18 1 1 1
## 3135 男性 5.095060 1000 1774 0 4 0 0 0
## 3722 男性 4.293418 1000 0 3 12 2 1 2
## 3805 男性 5.271930 1000 0 0 4 4 1 4
## 6148 女性 13.088921 10000 6550 0 3 3 1 3
## 7069 男性 4.597112 1000 0 0 4 0 0 0
# 定义正常使用率usage=1与非正常使用率usage=0
orderdata$usage <- 1*(orderdata$usage <= 1)
tab <- table(orderdata$usage)
tab##
## 0 1
## 3329 4671
##
## 0 1
## 0.416125 0.583875
##
## 0 1
## 0.416125 0.583875
# credit:信用额度的分布
hist(orderdata$credit,
main = "信用额度",
xlab = "credit",
ylab = "频数",
las = 1,
col = "skyblue")# 对数变换
orderdata$credit <- log(orderdata$credit)
hist(orderdata$credit,
main = "对数信用额度",
xlab = "log(credit)",
ylab = "频数",
las = 1,
col = "skyblue")# 定义两个与loan有关的两个解释变量,有无及金额
orderdata$Z1 <- 1*(orderdata$loan > 0)
orderdata$Z2 <- orderdata$loan
orderdata[orderdata$loan > 0,]$Z2 <- log(orderdata[orderdata$loan > 0,]$loan)
head(orderdata)## gender usage credit loan history accounts status Y1 Y2 Z1 Z2
## 1 女性 1 8.006368 0 0 0 0 0 0 0 0.00000
## 2 女性 0 8.006368 0 7 3 1 1 1 0 0.00000
## 3 男性 0 8.517193 0 7 2 3 1 3 0 0.00000
## 4 男性 1 8.006368 1745 0 0 0 0 0 1 7.46451
## 5 男性 0 7.600902 0 0 11 0 0 0 0 0.00000
## 6 女性 0 8.006368 0 0 2 3 1 3 0 0.00000
## [1] 0.169625
# 有房贷的金额分布
hist(orderdata[orderdata$Z1 == 1,]$Z2,
main = "有房贷者金额分布",
xlab = "对数房贷月供",
ylab = "频数",
las = 1,
col = "skyblue")# history历史逾期次数的分布
hist(orderdata$history,
main = "历史逾期次数分布",
xlab = "历史逾期次数",
ylab = "频数",
las = 1,
col = "skyblue")## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 3.365 4.000 67.000
hist_data <- hist(orderdata$account,
main = "信用卡开卡数分布",
xlab = "信用卡开卡数",
ylab = "频数",
ylim = c(0, 7000),
las = 1,
col = "skyblue")
# 添加频数值到图上
text(
x = hist_data$mids, # 每个柱子的中间位置
y = hist_data$counts, # 每个柱子的高度
labels = hist_data$counts, # 频数值
pos = 3, # 标签放在柱子顶部
cex = 0.8, # 调整字体大小
col = "red" # 设置字体颜色
)# 提取数据并转换为数据框
hist_table <- data.frame(
Bin_Start = hist_data$breaks[-length(hist_data$breaks)], # 每个区间的起点
Bin_End = hist_data$breaks[-1], # 每个区间的终点
Frequency = hist_data$counts # 每个区间的频数
)
# 显示表格
hist_table## Bin_Start Bin_End Frequency
## 1 0 5 6420
## 2 5 10 992
## 3 10 15 391
## 4 15 20 127
## 5 20 25 36
## 6 25 30 19
## 7 30 35 3
## 8 35 40 7
## 9 40 45 1
## 10 45 50 2
## 11 50 55 1
## 12 55 60 0
## 13 60 65 0
## 14 65 70 1
##
## 0 1
## 女性 0.4465409 0.5534591
## 男性 0.3566716 0.6433284
##
## 0 1
## 0 0.2820667 0.7179333
## 1 0.4587883 0.5412117
##
## 0 1
## 0 0.3605299 0.6394701
## 1 0.5062638 0.4937362
## 女性 男性
## 0.5534591 0.6433284
## 0 1
## 0.7179333 0.5412117
## 0 1
## 0.6394701 0.4937362
# Y1与连续型X作箱式图
par(mfrow = c(1, 3))
orderdata %>%
boxplot(credit ~ Y1,
.,
xlab = "是否逾期",
ylab = "对数授信额度",
names = c("非逾期", "逾期"),
col = c("skyblue", "pink"))
orderdata %>%
boxplot(history ~ Y1,
.,
xlab = "是否逾期",
ylab = "历史逾期次数",
names = c("非逾期", "逾期"),
col = c("skyblue", "pink"))
orderdata %>%
boxplot(accounts ~ Y1,
.,
xlab = "是否逾期",
ylab = "信用卡开卡数",
names = c("非逾期", "逾期"),
col = c("skyblue", "pink"))# 有房贷的月供的对数对Y1做箱式图
par(mfrow = c(1, 1))
orderdata1 <- orderdata[orderdata$Z1 == 1,]
boxplot(Z2 ~ Y1,
orderdata1,
main = "有房贷者月供分布",
xlab = "是否逾期",
ylab = "对数房贷月供",
names = c("非逾期", "逾期"),
col = c("skyblue", "pink"))# 结合Y2对gender,usage,X1进行描述性统计
aa <- orderdata[orderdata$Y1 == 1,]
aa$sex <- 1*(aa$gender == "男性")
dim(aa)## [1] 4918 12
## gender usage credit loan history accounts status Y1 Y2 Z1 Z2 sex
## 2 女性 0 8.006368 0 7 3 1 1 1 0 0 0
## 3 男性 0 8.517193 0 7 2 3 1 3 0 0 1
## 6 女性 0 8.006368 0 0 2 3 1 3 0 0 0
## 7 男性 0 8.517193 0 0 5 6 1 4 0 0 1
## 8 女性 1 8.294050 0 2 2 1 1 1 0 0 0
## 10 男性 1 8.006368 0 0 1 2 1 2 0 0 1
par(mfrow = c(1,3))
plot(tapply(aa$sex, aa$Y2, mean),
type = "b",
xlab = "逾期严重程度",
ylab = "男性占比",
ylim = c(0, 1)
)
plot(tapply(aa$usage, aa$Y2, mean),
type = "b",
xlab = "逾期严重程度",
ylab = "正常使用占比",
ylim = c(0, 1)
)
plot(tapply(aa$Z1,aa$Y2, mean),
type = "b",
xlab = "逾期严重程度",
ylab = "房贷比例",
ylim = c(0, 1)
)# Y2与连续型X(对数授信额度、历史逾期次数、信用卡开户数、对数月供)作箱式图
par(mfrow = c(2, 2))
boxplot(credit ~ Y2,
aa,
xlab = "逾期严重程度",
ylab = "对数授信额度",
col = c("skyblue", "pink"))
boxplot(history ~ Y2,
aa,
xlab = "逾期严重程度",
ylab = "历史逾期次数",
col = c("skyblue", "pink"))
boxplot(accounts ~ Y2,
aa,
xlab = "逾期严重程度",
ylab = "信用卡开户数",
col = c("skyblue", "pink"))
aa1 <- aa[aa$loan > 0,]
boxplot(Z2 ~ Y2,
aa1,
xlab = "逾期严重程度",
ylab = "对数月供",
col = c("skyblue", "pink"))# 对是否逾期建立0-1回归分析
orderdata$gender <- factor(orderdata$gender, levels = c("男性", "女性"))
# orderdata$gender <- relevel(orderdata$gender, ref = "男性")
model.full <- glm(Y1 ~ gender + usage + credit + Z1 + Z2 + history + accounts , data = orderdata, family = binomial(link = "logit"))
summary(model.full)##
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z1 + Z2 + history +
## accounts, family = binomial(link = "logit"), data = orderdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.369341 0.385580 8.738 < 2e-16 ***
## gender女性 -0.261012 0.054073 -4.827 1.39e-06 ***
## usage -0.360500 0.056459 -6.385 1.71e-10 ***
## credit -0.378850 0.045810 -8.270 < 2e-16 ***
## Z1 0.401117 0.526887 0.761 0.4465
## Z2 -0.101542 0.066921 -1.517 0.1292
## history 0.605662 0.023140 26.174 < 2e-16 ***
## accounts 0.010548 0.006208 1.699 0.0893 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10665.2 on 7999 degrees of freedom
## Residual deviance: 8924.5 on 7992 degrees of freedom
## AIC: 8940.5
##
## Number of Fisher Scoring iterations: 5
## [1] 0
##
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z2 + history + accounts,
## family = binomial(link = "logit"), data = orderdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.403849 0.382845 8.891 < 2e-16 ***
## gender女性 -0.261165 0.054072 -4.830 1.37e-06 ***
## usage -0.360641 0.056457 -6.388 1.68e-10 ***
## credit -0.382747 0.045514 -8.409 < 2e-16 ***
## Z2 -0.051068 0.008937 -5.714 1.10e-08 ***
## history 0.605350 0.023130 26.172 < 2e-16 ***
## accounts 0.010323 0.006200 1.665 0.0959 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10665.2 on 7999 degrees of freedom
## Residual deviance: 8925.1 on 7993 degrees of freedom
## AIC: 8939.1
##
## Number of Fisher Scoring iterations: 5
## [1] 0
# 自动寻找最优的BIC模型
ss = length(orderdata$Y1)
model.bic <- step(model.full, k = log(ss), trace = F)
summary(model.bic)##
## Call:
## glm(formula = Y1 ~ gender + usage + credit + Z2 + history, family = binomial(link = "logit"),
## data = orderdata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.528699 0.375333 9.402 < 2e-16 ***
## gender女性 -0.264004 0.054045 -4.885 1.03e-06 ***
## usage -0.390036 0.053674 -7.267 3.68e-13 ***
## credit -0.391475 0.045190 -8.663 < 2e-16 ***
## Z2 -0.048709 0.008818 -5.524 3.32e-08 ***
## history 0.603212 0.023069 26.148 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10665.2 on 7999 degrees of freedom
## Residual deviance: 8927.9 on 7994 degrees of freedom
## AIC: 8939.9
##
## Number of Fisher Scoring iterations: 5
## [1] 0
# 对3个模型做内样本预测
pred.full <- predict(model.full, type = "response")
pred.aic <- predict(model.aic, type = "response")
pred.bic <- predict(model.bic, type = "response")
# 计算AUC
library(pROC)
roc.full <- roc(orderdata$Y1, pred.full)## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [1] 0.7677270 0.7676767 0.7669658
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
aa$gender <- factor(aa$gender, levels = c("男性", "女性"))
probit.full <- polr(as.factor(Y2) ~ gender + usage + credit + Z1 + Z2 + history + accounts, method = "probit", data = aa, Hess = TRUE)
summary(probit.full)## Call:
## polr(formula = as.factor(Y2) ~ gender + usage + credit + Z1 +
## Z2 + history + accounts, data = aa, Hess = TRUE, method = "probit")
##
## Coefficients:
## Value Std. Error t value
## gender女性 -0.06715 0.034037 -1.973
## usage -0.02486 0.032584 -0.763
## credit -0.10441 0.029553 -3.533
## Z1 0.44756 0.365270 1.225
## Z2 -0.06693 0.046794 -1.430
## history 0.05429 0.005450 9.962
## accounts 0.02114 0.003504 6.032
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.4285 0.2461 -5.8037
## 2|3 -0.3110 0.2456 -1.2660
## 3|4 0.4829 0.2459 1.9634
##
## Residual Deviance: 12612.04
## AIC: 12632.04
# Y2有3个不同的截距项
tab <- as.data.frame(coefficients(summary(probit.full)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab## Value Std. Error t value p.value
## gender女性 -0.06714918 0.034037304 -1.972811 0.049
## usage -0.02486039 0.032584002 -0.762963 0.445
## credit -0.10440799 0.029553035 -3.532902 0.000
## Z1 0.44755705 0.365270145 1.225277 0.220
## Z2 -0.06692980 0.046793651 -1.430318 0.153
## history 0.05429390 0.005450082 9.962034 0.000
## accounts 0.02113707 0.003504275 6.031795 0.000
## 1|2 -1.42853910 0.246144151 -5.803669 0.000
## 2|3 -0.31095087 0.245608760 -1.266041 0.205
## 3|4 0.48289766 0.245944751 1.963440 0.050
# 建立空模型null model,只包含截距项
probit.null <- polr(as.factor(Y2) ~ 1, method = "probit", data = aa, Hess = TRUE)
summary(probit.null)## Call:
## polr(formula = as.factor(Y2) ~ 1, data = aa, Hess = TRUE, method = "probit")
##
## No coefficients
##
## Intercepts:
## Value Std. Error t value
## 1|2 -0.7007 0.0196 -35.8202
## 2|3 0.3910 0.0184 21.2785
## 3|4 1.1641 0.0230 50.5037
##
## Residual Deviance: 12802.75
## AIC: 12808.75
## [1] 0
## Call:
## polr(formula = as.factor(Y2) ~ gender + credit + Z2 + history +
## accounts, data = aa, Hess = TRUE, method = "probit")
##
## Coefficients:
## Value Std. Error t value
## gender女性 -0.06797 0.034029 -1.998
## credit -0.11018 0.029279 -3.763
## Z2 -0.01018 0.005985 -1.701
## history 0.05495 0.005364 10.244
## accounts 0.02171 0.003374 6.436
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.4607 0.2443 -5.9783
## 2|3 -0.3434 0.2438 -1.4087
## 3|4 0.4501 0.2441 1.8440
##
## Residual Deviance: 12614.12
## AIC: 12630.12
tab <- as.data.frame(coefficients(summary(probit.aic)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab## Value Std. Error t value p.value
## gender女性 -0.06797396 0.034028753 -1.997545 0.046
## credit -0.11018141 0.029278503 -3.763219 0.000
## Z2 -0.01017734 0.005984540 -1.700605 0.089
## history 0.05494772 0.005363807 10.244163 0.000
## accounts 0.02171399 0.003373886 6.435898 0.000
## 1|2 -1.46067265 0.244330975 -5.978254 0.000
## 2|3 -0.34339596 0.243768536 -1.408697 0.159
## 3|4 0.45012790 0.244097940 1.844046 0.065
## [1] 0
## BIC模型
ss2 <- length(aa$Y2)
probit.bic <- step(probit.full, k = log(ss2), trace = F)
summary(probit.bic)## Call:
## polr(formula = as.factor(Y2) ~ credit + history + accounts, data = aa,
## Hess = TRUE, method = "probit")
##
## Coefficients:
## Value Std. Error t value
## credit -0.12271 0.028166 -4.357
## history 0.05542 0.005358 10.345
## accounts 0.02112 0.003334 6.334
##
## Intercepts:
## Value Std. Error t value
## 1|2 -1.5344 0.2361 -6.5001
## 2|3 -0.4181 0.2354 -1.7760
## 3|4 0.3752 0.2358 1.5914
##
## Residual Deviance: 12621.05
## AIC: 12633.05
tab <- as.data.frame(coefficients(summary(probit.bic)))
tab$p.value <- round(2*(1-pnorm(abs(tab$`t value`))), 3)
tab## Value Std. Error t value p.value
## credit -0.12270807 0.028166195 -4.356572 0.000
## history 0.05542479 0.005357707 10.344871 0.000
## accounts 0.02111930 0.003334170 6.334198 0.000
## 1|2 -1.53438168 0.236054863 -6.500106 0.000
## 2|3 -0.41812594 0.235431197 -1.776001 0.076
## 3|4 0.37517650 0.235753519 1.591393 0.112
## [1] 0
# 分别以Y1和Y2为因变量的BIC模型进行预测
new <- data.frame(gender = "男性", usage = 1, credit = log(20000), loan = 4000, Z1 = 1, Z2 = log(4000), history = 4, accounts = 3)
pred <- predict(model.bic, new)
exp(pred)/(1+exp(pred))## 1
## 0.7808363
## [1] 2
## Levels: 1 2 3 4
## [1] 80000 6
## keyword impression click cost ranking conversion
## 1 飞机票价格怎么算 55 4 1.89 3.39 0
## 2 武汉特价飞机票 198 5 3.76 7.72 0
## 3 广州机票特价 6 1 0.98 2.00 0
## 4 昆明飞机票 205 1 0.63 10.56 0
## 5 北京机票网 21 1 0.89 4.82 0
## 6 武汉飞机 466 4 2.66 6.53 0
## keyword impression click cost ranking conversion kwlen
## 1 飞机票价格怎么算 55 4 1.89 3.39 0 8
## 2 武汉特价飞机票 198 5 3.76 7.72 0 7
## 3 广州机票特价 6 1 0.98 2.00 0 6
## 4 昆明飞机票 205 1 0.63 10.56 0 5
## 5 北京机票网 21 1 0.89 4.82 0 5
## Loading required package: jiebaRD
##
## Attaching package: 'jiebaR'
## The following object is masked from 'package:psych':
##
## distance
my.worker <- worker()
kw.seg <- segment(poisson$keyword, my.worker)
tab <- table(kw.seg)
tab <- sort(tab, decreasing = T)
tab2 <- tab[1:100]
tab2 <- tab2/sum(tab)
tab2## kw.seg
## 机票 飞机票 特价机票 查询 便宜 预订
## 0.143109670 0.076686612 0.049180103 0.047225140 0.033362262 0.027684648
## 打折 深圳 的 飞机 航班 特价
## 0.024857260 0.023742749 0.023249440 0.022678482 0.021225963 0.021157448
## 上海 广州 订机票 三亚 北京 网站
## 0.018448819 0.017101357 0.016886676 0.015826977 0.015699082 0.013977070
## 时刻表 网 厦门 预定 最 网上
## 0.013209702 0.013205134 0.011122277 0.011021788 0.010359476 0.009975791
## 买 南京 昆明 折扣 机票价格 哈尔滨
## 0.009879870 0.008144156 0.008112182 0.007952314 0.007646280 0.007427031
## 长沙 天津 武汉 海口 票价 机票网
## 0.007239757 0.007189513 0.007157539 0.007006806 0.006065866 0.005988215
## 订购 重庆 大连 北京机票 去 订票
## 0.005988215 0.005709588 0.005577125 0.005129493 0.005033572 0.004946787
## 哪个 乌鲁木齐 杭州 成都 价格 春节
## 0.004873704 0.004846298 0.004663591 0.004636185 0.004426072 0.004416937
## 西安 低价 郑州 多少钱 济南 订
## 0.004380396 0.004266204 0.004238798 0.004197689 0.003964738 0.003777463
## 定 呼和浩特 什么 哪里 长春 南昌
## 0.003676974 0.003640433 0.003539944 0.003521674 0.003444023 0.003275019
## 官网 廉价 往返机票 优惠机票 哪儿 烟台
## 0.003192801 0.003165395 0.003128854 0.003101448 0.002795414 0.002749737
## 五一 南宁 怎么 沈阳 太原 到
## 0.002640113 0.002640113 0.002571598 0.002283835 0.002256429 0.002215320
## 成都机票 吗 西双版纳 珠海 十一 最低
## 0.001936692 0.001904718 0.001881880 0.001840771 0.001763121 0.001758553
## 2013 时候 青岛 团购 国内机票 途牛网
## 0.001731147 0.001658064 0.001616955 0.001557575 0.001539305 0.001521034
## 特惠 航空 途牛 丽江 年 携程网
## 0.001466222 0.001333760 0.001310921 0.001306354 0.001274380 0.001269812
## 西安机票 哪网 电话 好 那个 儿童
## 0.001228703 0.001224136 0.001219568 0.001215000 0.001146485 0.001119079
## 往返 头等舱 民航 旅游
## 0.001109944 0.001096241 0.001082538 0.001068835
## [1] 0.9498972
# 生成新的解释变量,4水平的定性因子型变量ticket
ss <- length(poisson$keyword)
ticket <- rep("其他", ss)
kw <- poisson$keyword
pos <- grep("飞机票", kw)
ticket[pos] <- "飞机票"
kw <- gsub("飞机票", "-", kw)
pos <- grep("机票", kw)
ticket[pos] <- "机票"
kw <- gsub("机票", "-", kw)
pos <- grep("航班", kw)
ticket[pos] <- "航班"
kw <- gsub("航班", "-", kw)
head(ticket)## [1] "飞机票" "飞机票" "机票" "飞机票" "机票" "其他"
## [1] "character"
##
## 飞机票 航班 机票 其他
## 17658 4647 52401 5294
poisson$ticket <- ticket
# 生成新的解释变量,9水平的定性因子型变量price
kw.pattern <- c("特价", "便宜", "打折", "折扣", "低价", "廉价", "特惠", "最")
tmp <- rep("其他", ss)
kw <- poisson$keyword
for (i in 1:length(kw.pattern)) {
pos <- grep(kw.pattern[i], kw)
tmp[pos] <- kw.pattern[i]
kw <- gsub(kw.pattern[i], "-", kw)
}
tmp <- factor(tmp)
poisson$price <- tmp
table(tmp)## tmp
## 低价 便宜 其他 最 廉价 打折 折扣 特价 特惠
## 937 5042 47967 2762 693 5441 1460 15377 321
# 生成新的解释变量,10水平的定性因素型变量buy购买意愿
kw.pattern <- c("定", "订", "购", "买", "查询", "多少钱", "价格", "哪个", "时刻表")
tmp <- rep("其他", ss)
kw <- poisson$keyword
for (i in 1:length(kw.pattern)) {
pos <- grep(kw.pattern[i], kw)
tmp[pos] <- kw.pattern[i]
kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$buy <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table## tmp
## 查询 定 订 多少钱 购 价格 买 哪个 其他 时刻表
## 8954 3229 11066 919 2016 2643 2295 1067 44919 2892
# 生成新的解释变量,5水平的定性因素型变量city1一线城市
kw.pattern <- c("北京", "上海", "广州", "深圳")
tmp <- rep("其他", ss)
for (i in 1:length(kw.pattern)) {
pos <- grep(kw.pattern[i], kw)
tmp[pos] <- kw.pattern[i]
kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$city1 <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table## tmp
## 北京 广州 其他 上海 深圳
## 4543 3753 62476 4030 5198
# 生成新的解释变量,21水平的定性因素型变量city2热点城市
kw.pattern=c("三亚","厦门","南京","昆明","哈尔滨","天津",
"长沙","武汉","海口","重庆","大连","乌鲁木齐" ,"杭州",
"成都","西安","郑州","济南","呼和浩特","长春","南昌")
tmp <- rep("其他", ss)
for (i in 1:length(kw.pattern)) {
pos <- grep(kw.pattern[i], kw)
tmp[pos] <- kw.pattern[i]
kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$city2 <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table## tmp
## 成都 重庆 大连 哈尔滨 海口 杭州 呼和浩特 济南
## 1439 1250 1221 1626 1532 1021 797 868
## 昆明 南昌 南京 其他 三亚 厦门 天津 武汉
## 1776 717 1783 51337 3465 2471 1574 1567
## 乌鲁木齐 西安 长春 长沙 郑州
## 1061 1228 754 1585 928
# 生成新的解释变量,7水平的定性因素型变量brand广告主品牌
kw.pattern=c("携程","途牛","去哪儿","艺龙","同程","酷讯")
tmp <- rep("其他", ss)
for (i in 1:length(kw.pattern)) {
pos <- grep(kw.pattern[i], kw)
tmp[pos] <- kw.pattern[i]
kw <- gsub(kw.pattern[i], "-", kw)
}
poisson$brand <- tmp
freq_table <- table(tmp)
sorted_table <- freq_table[order(stringi::stri_trans_general(names(freq_table), "Han-Latin"))]
sorted_table## tmp
## 酷讯 其他 去哪儿 同程 途牛 携程 艺龙
## 149 77939 601 129 620 436 126
# 定义两个因变量Y1(有无转化)和Y2(转化次数)
poisson$Y1 <- 1*(poisson$conversion > 0)
poisson$Y2 <- poisson$Y1 *(poisson$conversion - 1 )
poisson$CTR <- poisson$click/poisson$impression * 100
poisson$CPC <- poisson$cost/poisson$click
# 有转化数据中Y2的直方图
aa <- poisson[poisson$Y1 == 1,]
hist(aa$conversion ,
xlab = "转化程度",
ylab = "频数",
col = "skyblue")## keyword impression click cost ranking conversion kwlen ticket
## 6739 机票预订 32771 1302 1916.41 8.09 95 4 机票
## 8148 网上预订机票 1537 206 164.18 7.67 64 6 机票
## 11543 机票 61480 1277 2100.01 3.95 62 2 机票
## 29079 机票预订 35349 1368 2017.65 8.43 92 4 机票
## 36229 网上预订机票 1293 409 341.06 5.51 70 6 机票
## 39722 网上预订机票 1499 218 172.30 6.94 62 6 机票
## 50506 网上预订机票 911 176 141.27 5.70 69 6 机票
## 75322 机票预订 92999 1107 1252.46 7.81 75 4 机票
## 75459 机票预订 31136 1105 1632.17 9.03 92 4 机票
## price buy city1 city2 brand Y1 Y2 CTR CPC
## 6739 其他 订 其他 其他 其他 1 94 3.973025 1.4718971
## 8148 其他 订 其他 其他 其他 1 63 13.402733 0.7969903
## 11543 其他 其他 其他 其他 其他 1 61 2.077098 1.6444871
## 29079 其他 订 其他 其他 其他 1 91 3.869982 1.4748904
## 36229 其他 订 其他 其他 其他 1 69 31.631864 0.8338875
## 39722 其他 订 其他 其他 其他 1 61 14.543029 0.7903670
## 50506 其他 订 其他 其他 其他 1 68 19.319429 0.8026705
## 75322 其他 订 其他 其他 其他 1 74 1.190335 1.1314002
## 75459 其他 订 其他 其他 其他 1 91 3.548947 1.4770769
# 对展现量、点击率及单位点击成本的对数做直方图
par(mfrow = c(1, 3))
hist(log(poisson$impression),
xlab = "展现量",
ylab = "频数",
col = "skyblue")
hist(log(poisson$CTR),
xlab = "点击率",
ylab = "频数",
col = "skyblue")
hist(log(poisson$CPC),
xlab = "单位点击成本",
ylab = "频数",
col = "skyblue")# 对关健词排名及关键词长度做直方图
par(mfrow = c(1, 2))
hist(poisson$ranking,
xlab = "排名",
ylab = "频数",
col = "skyblue")
hist(poisson$kwlen,
xlab = "关键词长度",
ylab = "频数",
col = "skyblue")# 对6个离散型变量做柱状图
# 机票表达方式、价格偏好表达方式、购买意愿表达方式、一线城市、热点城市、广告主品牌
par(mfrow = c(3, 2))
library(tidyverse)
tab = table(poisson$ticket)
tab##
## 其他 机票 航班 飞机票
## 5294 52401 4647 17658
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")
tab = table(poisson$price)
tab##
## 低价 便宜 其他 最 廉价 打折 折扣 特价 特惠
## 937 5042 47967 2762 693 5441 1460 15377 321
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")
tab = table(poisson$buy)
tab##
## 买 价格 其他 哪个 多少钱 定 时刻表 查询 订 购
## 2295 2643 44919 1067 919 3229 2892 8954 11066 2016
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")
tab = table(poisson$city1)
tab##
## 上海 其他 北京 广州 深圳
## 4030 62476 4543 3753 5198
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")
tab = table(poisson$city2)
tab##
## 三亚 乌鲁木齐 其他 南京 南昌 厦门 呼和浩特 哈尔滨
## 3465 1061 51337 1783 717 2471 797 1626
## 大连 天津 成都 昆明 杭州 武汉 济南 海口
## 1221 1574 1439 1776 1021 1567 868 1532
## 西安 郑州 重庆 长春 长沙
## 1228 928 1250 754 1585
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")
tab = table(poisson$brand)
tab##
## 其他 去哪儿 同程 携程 艺龙 途牛 酷讯
## 77939 601 129 436 126 620 149
# 将tab按各类别的拼音排序
tab = tab[names(tab) != "其他"]
tab[order(stringi::stri_trans_general(names(tab), "Han-Latin"))] %>%
barplot(
las = 1,
col = "skyblue")# 结合Y1,对3个连续型解释变量做箱式图
par(mfrow = c(1, 3))
boxplot(log(impression) ~ Y1,
poisson,
xlab = "是否转化",
ylab = "对数展现量",
names = c("未转化", "转化"),
col = c("skyblue", "pink"))
boxplot(log(CTR) ~ Y1,
poisson,
xlab = "是否转化",
ylab = "对数点击率",
names = c("未转化", "转化"),
col = c("skyblue", "pink"))
boxplot(log(CPC) ~ Y1,
poisson,
xlab = "是否转化",
ylab = "对数单位点击成本",
names = c("未转化", "转化"),
col = c("skyblue", "pink"))# Y1对排名、关键词长度的描述统计
par(mfrow = c(1, 2))
boxplot(ranking ~ Y1,
poisson,
xlab = "是否转化",
ylab = "排名",
names = c("未转化", "转化"),
col = c("skyblue", "pink"))
boxplot(poisson$kwlen ~ Y1,
poisson,
xlab = "是否转化",
ylab = "关键词长度",
names = c("未转化", "转化"),
col = c("skyblue", "pink"))# Y1 对6个离散型变量的描述统计,并作柱状图
par(mfrow = c(3, 2))
mu <- tapply(poisson$Y1, poisson$ticket, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)
mu <- tapply(poisson$Y1, poisson$price, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)
mu <- tapply(poisson$Y1, poisson$buy, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)
mu <- tapply(poisson$Y1, poisson$city1, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)
mu <- tapply(poisson$Y1, poisson$city2, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)
mu <- tapply(poisson$Y1, poisson$brand, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu)# 结合Y2(转化量-1)对3个连续型解释变量做柱形图
par(mfrow = c(1, 3))
Y2 <- aa$Y2
Z <- aa$impression
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))
mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "展现量", ylab = "转化程度", ylim = c(0, 5))
Z <- aa$CTR
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))
mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "点击率", ylab = "转化程度", ylim = c(0, 5))
Z <- aa$CPC
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("低", "高"))
mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "单位点击成本", ylab = "转化程度", ylim = c(0, 5))# Y2 对关键词排名、长度进行描述
par(mfrow = c(1,2))
Z <- aa$ranking
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("前", "后"))
mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "排名", ylab = "转化程度", ylim = c(0, 5))
Z <- aa$kwlen
Z <- 1*(Z > median(Z))
Z <- factor(Z, labels = c("短", "长"))
mu <- tapply(Y2, Z, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, xlab = "关键词长度", ylab = "转化程度", ylim = c(0, 5))# Y2 对6个离散型变量描述
par(mfrow = c(3,2))
mu <- tapply(aa$Y2, aa$ticket, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))
mu <- tapply(aa$Y2, aa$price, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))
mu <- tapply(aa$Y2, aa$buy, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))
mu <- tapply(aa$Y2, aa$city1, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))
mu <- tapply(aa$Y2, aa$city2, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))
mu <- tapply(aa$Y2, aa$brand, mean)
mu <- mu[order(stringi::stri_trans_general(names(mu), "Han-Latin"))]
barplot(mu, ylim = c(0,4))